Set up
suppressPackageStartupMessages({
library(tidyverse)
})
Read in data and process
pbta_df <- readr::read_tsv(pbta_file, guess_max = 100000, show_col_types = FALSE) %>%
select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, cg_multiple, cg_id, cgGFAC, tumor_descriptor)
tmb_vaf_df <- readr::read_tsv(tmb_vaf_file, guess_max = 100000, show_col_types = FALSE) %>%
filter(!tmb >= 10) %>%
select(Kids_First_Biospecimen_ID, Variant_Classification, gene_protein, mutation_count, region_size, tmb, VAF)
genomic_paired_df <- readr::read_tsv(genomic_paired_file, guess_max = 100000, show_col_types = FALSE) %>%
left_join(pbta_df, by = c("Kids_First_Participant_ID")) %>%
left_join(tmb_vaf_df, by = c("Kids_First_Biospecimen_ID")) %>%
filter(!is.na(tmb))
# Attention as some bs specimen might not have TMB!
# If that happens, we will end up with samples lacking timepoints.
# Which patient samples don't have TMB?
# genomic_paired_df %>%
# filter(is.na(tmb)) %>%
# unique() %>%
# regulartable() %>%
# fontsize(size = 12, part = "all")
descriptors_df <- genomic_paired_df %>%
group_by(Kids_First_Participant_ID) %>%
summarize(descriptors = paste(sort(tumor_descriptor), collapse = ", "),)
# Vector to order timepoints
timepoints <- c("Diagnosis", "Progressive", "Recurrence", "Deceased", "Second Malignancy", "Unavailable")
# Create df to use for plots
df_plot <- genomic_paired_df %>%
left_join(descriptors_df, by = c("Kids_First_Participant_ID", "descriptors")) %>%
mutate(td_cgGFAC = case_when(grepl("Deceased", tumor_descriptor) ~ "xDeceased",
TRUE ~ tumor_descriptor),
match_id = paste(tumor_descriptor, Kids_First_Participant_ID, sep = "_"),
tumor_descriptor = factor(tumor_descriptor),
tumor_descriptor = fct_relevel(tumor_descriptor, timepoints))
# Let's count number of samples
count_df <- df_plot %>%
group_by(tumor_descriptor, cg_id, match_id, Variant_Classification) %>%
dplyr::count(cg_id)
Define parameters for plots
# Read color palette
palette_df <- readr::read_tsv(palette_file, guess_max = 100000, show_col_types = FALSE)
# Define and order palette
palette <- palette_df$hex_codes
names(palette) <- palette_df$color_names
Alterations per timepoint
# Define parameters for function
x_value <- count_df$tumor_descriptor
title <- paste("Variant types in PBTA cohort", sep = " ")
# Run function
fname <- paste0(plots_dir, "/", "Alteration_type_timepoints_barplots.pdf")
print(fname)
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/Alteration_type_timepoints_barplots.pdf"
p <- create_stacked_barplot_variant(count_df = count_df, x = x_value, palette = palette, title = title)
pdf(file = fname, width = 6, height = 6)
print(p)
dev.off()
quartz_off_screen
2

Alterations per timepoint in each cancer type
cg_id_id <- as.character(unique(count_df$cg_id))
cg_id_id <- sort(cg_id_id, decreasing = FALSE)
cg_id_id
[1] "Adamantinomatous Craniopharyngioma" "Atypical Teratoid Rhabdoid Tumor" "Chordoma"
[4] "Choroid plexus carcinoma" "CNS Embryonal tumor" "Craniopharyngioma"
[7] "Diffuse midline glioma" "Dysembryoplastic neuroepithelial tumor" "Embryonal tumor with multilayer rosettes"
[10] "Ependymoma" "Ewing sarcoma" "Ganglioglioma"
[13] "Glial-neuronal tumor" "Hemangioblastoma" "High-grade glioma"
[16] "Low-grade glioma" "Malignant peripheral nerve sheath tumor" "Medulloblastoma"
[19] "Meningioma" "Neuroblastoma" "Neurofibroma/Plexiform"
[22] "Pilocytic astrocytoma" "Rosai-Dorfman disease" "Schwannoma"
# Define parameters for function
x_value <- count_df$match_id
title <- paste("Variant types in PBTA cohort across cancer groups", sep = " ")
# Run function
fname <- paste0(plots_dir, "/", "Alteration_type_timepoints_cg_id_barplots.pdf")
print(fname)
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/Alteration_type_timepoints_cg_id_barplots.pdf"
p <- create_stacked_barplot_variant_cg_id(count_df = count_df, x = x_value, palette = palette, title = title)
pdf(file = fname, width = 30, height = 18)
print(p)
dev.off()
quartz_off_screen
2

Alterations per timepoint in each cancer type defined by cgGFAC
df_plot_cgGFAC <- df_plot %>%
group_by(tumor_descriptor, cgGFAC, Kids_First_Biospecimen_ID, Variant_Classification) %>%
dplyr::count(cgGFAC)
cgGFAC_id <- as.character(unique(df_plot_cgGFAC$cgGFAC))
cgGFAC_id <- sort(cgGFAC_id, decreasing = FALSE)
cgGFAC_id
[1] "ATRT" "DMG" "HGG" "LGG" "Other"
# Define parameters for function
x_value <- df_plot_cgGFAC$match_id
Warning: Unknown or uninitialised column: `match_id`.
title <- paste("Variant types in PBTA cohort across cgGFAC", sep = " ")
# Run function
p <- create_stacked_barplot_variant_cgGFAC(count_df = df_plot_cgGFAC, x = x_value, palette = palette, title = title)
Error in `geom_bar()` at ]8;line = 173:col = 2;file:///Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/util/function-create-barplot.Rtmb-vaf-longitudinal/util/function-create-barplot.R:173:2]8;;:
! Problem while setting up geom.
ℹ Error occurred in the 1st layer.
Caused by error in `compute_geom_1()`:
! `geom_bar()` requires the following missing aesthetics: x
Backtrace:
1. global create_stacked_barplot_variant_cgGFAC(...)
3. ggplot2:::print.ggplot(...)
5. ggplot2:::ggplot_build.ggplot(x)
6. ggplot2:::by_layer(...)
13. ggplot2 (local) f(l = layers[[i]], d = data[[i]])
14. l$compute_geom_1(d)
15. ggplot2 (local) compute_geom_1(..., self = self)

Alterations per timepoint in each cancer type and timepoint
model
tm_df_plot <- df_plot %>%
filter(!is.na(timepoints_models)) %>%
group_by(tumor_descriptor, cg_id, timepoints_models, Kids_First_Biospecimen_ID, Variant_Classification) %>%
dplyr::count(timepoint_cgGFAC_n)
tm <- as.character(unique(tm_df_plot$timepoints_models))
tm <- sort(tm, decreasing = FALSE)
tm
[1] "Dx-Dec" "Dx-Pro" "Dx-Pro-Dec" "Dx-Pro-Rec" "Dx-Pro-Rec-Dec" "Dx-Rec" "Dx-Rec-Dec" "Dx-SM"
[9] "Pro-Dec" "Pro-Rec" "Pro-Rec-Dec" "Rec-Dec" "Rec-SM"
# Loop through variable
for (i in seq_along(tm)){
print(i)
df_sub <- tm_df_plot %>%
filter(timepoints_models == tm[i])
# Define parameters for function
x_value <- df_sub$Kids_First_Biospecimen_ID
title <- paste(tm[i])
# Run function
p <- create_stacked_barplot_variant_cg_id(count_df = df_sub, x = x_value, palette = palette, title = title)
}
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
[1] 11
[1] 12
[1] 13













---
title: "Classification of Variants across paired longitudinal samples in the PBTA Cohort"
author: 'Antonia Chroni <chronia@chop.edu> for D3B'
date: "2023"
output:
  html_notebook:
    toc: TRUE
    toc_float: TRUE
---

# Set up
```{r load-library}
suppressPackageStartupMessages({
  library(tidyverse)
})
```

# Directories and File Inputs/Outputs
```{r set-dir-and-file-names}
# Detect the ".git" folder -- this will be in the project root directory
# Use this as the root directory to ensure proper sourcing of functions 
# no matter where this is called from
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
analysis_dir <- file.path(root_dir, "analyses", "tmb-vaf-longitudinal")
results_dir <- file.path(analysis_dir, "results")
input_dir <- file.path(analysis_dir, "input")
files_dir <- file.path(root_dir, "analyses", "sample-distribution-analysis", "results")

# Input files
pbta_file <- file.path(files_dir, "pbta.tsv") # file from add-sample-distribution module
genomic_paired_file <- file.path(files_dir, "genomic_assays_matched_time_points.tsv")
tmb_vaf_file <- file.path(results_dir, "tmb_vaf_genomic.tsv")
palette_file <- file.path(root_dir, "figures", "palettes", "oncoprint_color_palette.tsv")

# File path to plot directory
plots_dir <-
  file.path(analysis_dir, "plots")
if (!dir.exists(plots_dir)) {
  dir.create(plots_dir)
}

source(paste0(root_dir, "/figures/scripts/theme.R"))
source(paste0(analysis_dir, "/util/function-create-barplot.R"))
```

# Read in data and process

```{r load-process-inputs}
pbta_df <- readr::read_tsv(pbta_file, guess_max = 100000, show_col_types = FALSE) %>% 
  select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, cg_multiple, cg_id, cgGFAC, tumor_descriptor)

tmb_vaf_df <- readr::read_tsv(tmb_vaf_file, guess_max = 100000, show_col_types = FALSE) %>% 
  filter(!tmb >= 10) %>% 
  select(Kids_First_Biospecimen_ID, Variant_Classification, gene_protein, mutation_count,	region_size, tmb, VAF)

genomic_paired_df <- readr::read_tsv(genomic_paired_file, guess_max = 100000, show_col_types = FALSE) %>%
  left_join(pbta_df, by = c("Kids_First_Participant_ID")) %>% 
  left_join(tmb_vaf_df, by = c("Kids_First_Biospecimen_ID")) %>%
  filter(!is.na(tmb))

# Attention as some bs specimen might not have TMB!
# If that happens, we will end up with samples lacking timepoints.

# Which patient samples don't have TMB?
# genomic_paired_df %>% 
#  filter(is.na(tmb)) %>% 
#  unique() %>% 
#  regulartable() %>%
#  fontsize(size = 12, part = "all")

descriptors_df <- genomic_paired_df %>%
  group_by(Kids_First_Participant_ID) %>%
  summarize(descriptors = paste(sort(tumor_descriptor), collapse = ", "),) 

# Vector to order timepoints
timepoints <- c("Diagnosis", "Progressive", "Recurrence", "Deceased", "Second Malignancy", "Unavailable")

# Create df to use for plots
df_plot <- genomic_paired_df %>% 
  left_join(descriptors_df, by = c("Kids_First_Participant_ID", "descriptors")) %>% 
  mutate(td_cgGFAC = case_when(grepl("Deceased", tumor_descriptor) ~ "xDeceased",
                               TRUE ~ tumor_descriptor),
         match_id = paste(tumor_descriptor, Kids_First_Participant_ID, sep = "_"),
         tumor_descriptor = factor(tumor_descriptor),
         tumor_descriptor = fct_relevel(tumor_descriptor, timepoints))

# Let's count number of samples 
count_df <- df_plot %>% 
  group_by(tumor_descriptor, cg_id, match_id, Variant_Classification) %>% 
  dplyr::count(cg_id) 

``` 

# Define parameters for plots

```{r define-parameters-for-plots}
# Read color palette
palette_df <- readr::read_tsv(palette_file, guess_max = 100000, show_col_types = FALSE) 

# Define and order palette
palette <- palette_df$hex_codes
names(palette) <- palette_df$color_names
```

# Alterations per timepoint

```{r plot-timepoint, fig.width = 6, fig.height = 6, fig.fullwidth = TRUE}
# Define parameters for function
x_value <- count_df$tumor_descriptor
title <- paste("Variant types in PBTA cohort", sep = " ")

# Run function
fname <- paste0(plots_dir, "/", "Alteration_type_timepoints_barplots.pdf")
print(fname)
p <- create_stacked_barplot_variant(count_df = count_df, x = x_value, palette = palette, title = title)
pdf(file = fname, width = 6, height = 6)
print(p)
dev.off()
```

# Alterations per timepoint in each cancer type

```{r plot-cg-id, fig.width = 30, fig.height = 18, fig.fullwidth = TRUE}
cg_id_id <- as.character(unique(count_df$cg_id)) 
cg_id_id <- sort(cg_id_id, decreasing = FALSE)
cg_id_id

# Define parameters for function
x_value <- count_df$match_id
title <- paste("Variant types in PBTA cohort across cancer groups", sep = " ")

# Run function
fname <- paste0(plots_dir, "/", "Alteration_type_timepoints_cg_id_barplots.pdf")
print(fname)
p <- create_stacked_barplot_variant_cg_id(count_df = count_df, x = x_value, palette = palette, title = title)
pdf(file = fname, width = 30, height = 18)
print(p)
dev.off()

```

# Alterations per timepoint in each cancer type defined by cgGFAC

```{r plot-cgGFAC-n-individual-plots, fig.width = 5, fig.height = 5, fig.fullwidth = TRUE}
df_plot_cgGFAC <- df_plot %>%
  group_by(tumor_descriptor, cgGFAC, match_id, Variant_Classification) %>% 
  dplyr::count(cgGFAC)

cgGFAC_id <- as.character(unique(df_plot_cgGFAC$cgGFAC)) 
cgGFAC_id <- sort(cgGFAC_id, decreasing = FALSE)
cgGFAC_id

# Define parameters for function
x_value <- df_plot_cgGFAC$match_id
title <- paste("Variant types in PBTA cohort across cgGFAC", sep = " ")

# Run function
p <- create_stacked_barplot_variant_cgGFAC(count_df = df_plot_cgGFAC, x = x_value, palette = palette, title = title)

```

# Alterations per timepoint in each cancer type and timepoint model

```{r plot-timepoint-model, fig.width = 15, fig.height = 12, fig.fullwidth = TRUE}
tm_df_plot <- df_plot %>%
  filter(!is.na(timepoints_models)) %>% 
  group_by(tumor_descriptor, cg_id, timepoints_models, Kids_First_Biospecimen_ID, Variant_Classification) %>% 
  dplyr::count(timepoint_cgGFAC_n)

tm <- as.character(unique(tm_df_plot$timepoints_models))
tm <- sort(tm, decreasing = FALSE)
tm

# Loop through variable
for (i in seq_along(tm)){
  print(i)
  df_sub <- tm_df_plot %>%
      filter(timepoints_models == tm[i])
  
   # Define parameters for function
  x_value <- df_sub$match_id
  title <- paste(tm[i])
  
  # Run function
  p <- create_stacked_barplot_variant_cg_id(count_df = df_sub, x = x_value, palette = palette, title = title)
  
}
```


```{r echo=TRUE}
sessionInfo()
```
